Loading packages to simulate and manipulate data.
library(MASS)
library(tidyverse)Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.1.1 ✓ dplyr 1.0.6
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x dplyr::select() masks MASS::select()
Data generation was performed according to the following conditions:
In this way, 20 dataframes are generated with different amounts of data within each of them, which will have 1,000 replications. In total, 20,000 dataframes with observations for analysis.
Create the variables that indicate the conditions
set.seed(2020)
r <- c(0.12, 0.20, 0.31, 0.50) ## Correlaciones
n <- c(50, 100, 250, 500, 1000) ## Tamaño de muestra
replic <- 1000# Generar las matrices de correlación
sigma <- list()
for (i in seq_along(r)) {
sigma[[i]] <- matrix(data = c(1, rep(r[i], 2), 1),
nrow = 2,
ncol = 2)
}
# Generar los datos de correlación
df_cor <- list()
for (i in seq_along(sigma)) {
df_cor[[i]] <- list()
for (j in seq_along(n)) {
df_cor[[i]][[j]] <- list()
for (k in 1:replic) {
df_cor[[i]][[j]][[k]] <- mvrnorm(n = n[j],
mu = rep(0, 2),
Sigma = sigma[[i]]) %>%
as_tibble()
}
}
}The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.
We will assemble the list object in such a way that we can identify by columns the correlation, sample size and replicate number of each observation.
Five percent of the data in each data frame will be randomly replaced by outlier correlations different from the one generated. For example, in the case of a dataframe with a correlation of 0.12 and a sample size of 50, 3 pairs of correlations will be randomly replaced with 3 pairs of correlations generated with correlations of 0.20, 0.31 and 0.50.
# Add the number of outliers to be added and their locations Outliers at 5%.
df_work <- df_nest %>%
rowwise() %>%
mutate(
ratio_outlier = map_dbl(n,
~ ceiling(0.05*n)),
posici_rep_m3 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m6 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m3v1 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m3v2 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m3v3 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m6v1 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m6v2 = map2(n, ratio_outlier,
~ sample(1:.x, .y)),
posici_rep_m6v3 = map2(n, ratio_outlier,
~ sample(1:.x, .y))
)
# Add correlations that were not considered
df_work <- df_work %>%
mutate(
correla_a = r[which(correlacion != r)[1]],
correla_b = r[which(correlacion != r)[2]],
correla_c = r[which(correlacion != r)[3]]
)On the correlations identified to be generated, the data matrix necessary for the multivariate simulation is created as an outlier.
df_work <- df_work %>%
mutate(
matrix = map(correlacion,
~ matrix(data = rep(c(1, rep(.x, 2)), 2),
nrow = 2,
ncol = 2)),
matrix_a = map(correla_a,
~ matrix(data = rep(c(1, rep(.x, 2)), 2),
nrow = 2,
ncol = 2)),
matrix_b = map(correla_b,
~ matrix(data = rep(c(1, rep(.x, 2)), 2),
nrow = 2,
ncol = 2)),
matrix_c = map(correla_c,
~ matrix(data = rep(c(1, rep(.x, 2)), 2),
nrow = 2,
ncol = 2))
) %>%
ungroup()Aggregate outliers are of 2 types: they vary only by the mean and they vary by the mean and its matrix:
set.seed(2019)
df_work <- df_work %>%
mutate(
outlier_m3 = map2(ratio_outlier, matrix,
~ mvrnorm(n = .x,
mu = rep(3, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m6 = map2(ratio_outlier, matrix,
~ mvrnorm(n = .x,
mu = rep(6, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m3v1 = map2(ratio_outlier, matrix_a,
~ mvrnorm(n = .x,
mu = rep(3, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m3v2 = map2(ratio_outlier, matrix_b,
~ mvrnorm(n = .x,
mu = rep(3, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m3v3 = map2(ratio_outlier, matrix_c,
~ mvrnorm(n = .x,
mu = rep(3, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m6v1 = map2(ratio_outlier, matrix_a,
~ mvrnorm(n = .x,
mu = rep(6, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m6v2 = map2(ratio_outlier, matrix_b,
~ mvrnorm(n = .x,
mu = rep(6, 2),
Sigma = .y) %>%
as_tibble()),
outlier_m6v3 = map2(ratio_outlier, matrix_c,
~ mvrnorm(n = .x,
mu = rep(6, 2),
Sigma = .y) %>%
as_tibble())
)These new simulated outlier correlations will be inserted into the initially calculated random positions in each dataframe.
df_work <- df_work %>%
mutate(
data_out_m3 = pmap(list(data, posici_rep_m3, outlier_m3),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m6 = pmap(list(data, posici_rep_m6, outlier_m6),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m3v1 = pmap(list(data, posici_rep_m3v1, outlier_m3v1),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m3v2 = pmap(list(data, posici_rep_m3v2, outlier_m3v2),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m3v3 = pmap(list(data, posici_rep_m3v3, outlier_m3v3),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m6v1 = pmap(list(data, posici_rep_m6v1, outlier_m6v1),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m6v2 = pmap(list(data, posici_rep_m6v2, outlier_m6v2),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3)),
data_out_m6v3 = pmap(list(data, posici_rep_m6v3, outlier_m6v3),
~ ..1 %>%
slice(- ..2) %>%
bind_rows(..3))
)Additionally, new dataframes with the same correlation conditions, sample size and number of replications with non-normal data distributions are generated using the algorithm of Vale and Maurelli (1983).
Non-normality conditions were generated on the basis of the work of Sheng & Sheng (2012):
set.seed(2021)
library(semTools)
df_work <- df_work %>%
mutate(
data_nonorm1 = map2(n, matrix,
~ mvrnonnorm(n = .x,
mu = rep(0, 2),
Sigma = .y,
skewness = c(0),
kurtosis = c(-1.385)) %>% # symmetric platykurtic distribution
as_tibble()),
data_nonorm2 = map2(n, matrix,
~ mvrnonnorm(n = .x,
mu = rep(0, 2),
Sigma = .y,
skewness = 0,
kurtosis = 25) %>% # symmetric leptokurtic distribution
as_tibble()),
data_nonorm3 = map2(n, matrix,
~ mvrnonnorm(n = .x,
mu = rep(0, 2),
Sigma = .y,
skewness = 0.96,
kurtosis = 0.13) %>% # non-symmetric distribution
as_tibble()),
data_nonorm4 = map2(n, matrix,
~ mvrnonnorm(n = .x,
mu = rep(0, 2),
Sigma = .y,
skewness = 0.48,
kurtosis = -0.92) %>% # non-symmetric platykurtic distribution
as_tibble()),
data_nonorm5 = map2(n, matrix,
~ mvrnonnorm(n = .x,
mu = rep(0, 2),
Sigma = .y,
skewness = 2.5,
kurtosis = 25) %>% # non-symmetric leptokurtic distribution
as_tibble())
)Finally, the variables that will no longer be used are eliminated.
df_work <- df_work %>%
select(-c(ratio_outlier:outlier_m6v3))
df_work